Problem 1

library(readr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
day <- read_csv("cabi.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   season = col_integer(),
##   year = col_integer(),
##   month = col_integer(),
##   wday = col_integer(),
##   hr = col_integer(),
##   temp = col_double(),
##   atemp = col_double(),
##   hum = col_double(),
##   windspeed = col_double(),
##   weather = col_integer(),
##   casual = col_integer(),
##   registered = col_integer()
## )
day$X1 <- NULL

# a

ggplot(day, aes(x = as.factor(year), y = casual)) + geom_boxplot() + 
  labs(title="Boxplot of Causal Riders", 
           subtitle="Variation by Year",
           x = "Year",
           y = "Number of Riders") + scale_x_discrete(labels=c("2011", "2012"))

ggplot(day, aes(x = as.factor(year), y = registered)) + geom_boxplot() + 
  labs(title="Boxplot of Registered Riders", 
           subtitle="Variation by Year",
           x = "Year",
           y = "Number of Riders") + scale_x_discrete(labels=c("2011", "2012"))

ggplot(day, aes(x = as.factor(month), y = casual)) + geom_boxplot() + 
  labs(title="Casual Riders", 
           subtitle="Variation by Month",
           x = "Month",
           y = "Number of Riders") +
  scale_x_discrete(labels=c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
                            "Nov", "Dec"))

ggplot(day, aes(x = as.factor(month), y = registered)) + geom_boxplot() + 
  labs(title="Registered Riders", 
           subtitle="Variation by Month",
           x = "Month",
           y = "Number of Riders") +
  scale_x_discrete(labels=c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
                            "Nov", "Dec"))

ggplot(day, aes(x = (hr + 1), y = casual)) + geom_bar(stat = "identity") + 
  labs(title="Casual Riders", 
           subtitle="Variation by Hour of Day",
           x = "Hour",
           y = "Number of Riders") +
  scale_x_continuous(limits=c(1, 24)) +
  scale_y_continuous(limits=c(0, 300000))

ggplot(day, aes(x = (hr + 1), y = registered)) + geom_bar(stat = "identity") + 
  labs(title="Registered Riders", 
           subtitle="Variation by Hour of Day",
           x = "Hour",
           y = "Number of Riders") +
  scale_x_continuous(limits=c(1, 24)) +
  scale_y_continuous(limits=c(0, 300000))

# b

ggplot(day, aes(x = casual, y = registered)) + geom_jitter(aes(colour = as.factor(year))) +
  geom_smooth(aes(colour = as.factor(year))) + scale_color_manual(labels = c("2011", "2012"), values = c("blue", "red")) + scale_x_continuous(limits = c(0,1000)) + scale_y_continuous(limits = c(0,1000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 791 rows containing missing values (geom_point).

ggplot(day, aes(x = casual, y = registered)) + geom_jitter(aes(colour = as.factor(wday))) +
  geom_smooth(aes(colour = as.factor(wday))) + scale_color_manual(labels = c("Not a working day", "Working Day"), values = c("blue", "red")) + scale_x_continuous(limits = c(0,1000)) + scale_y_continuous(limits = c(0,1000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 789 rows containing missing values (geom_point).

# c

ggplot(day, aes(x = as.factor(weather), y = casual)) + geom_bar(stat = "identity") +
  labs(title="Casual Riders", 
           subtitle="Variation by Weather",
           x = "Weather Type",
           y = "Number of Riders") +
  scale_x_discrete(labels=c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"))

ggplot(day, aes(x = as.factor(weather), y = registered)) + geom_bar(stat = "identity") +
  labs(title="Registered Riders", 
           subtitle="Variation by Weather",
           x = "Weather Type",
           y = "Number of Riders") +
  scale_x_discrete(labels=c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"))

# d

ggplot(day, aes(x = (hr + 1), y = casual, fill = as.factor(weather))) + 
  geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Casual Riders", 
           subtitle="Variation by Time of Day and Weather Type",
           x = "Hour of Day",
           y = "Number of Riders")

ggplot(day, aes(x = (hr + 1), y = registered, fill = as.factor(weather))) + geom_bar(stat = "identity") +  scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green"))  + labs(title="Registered Riders", 
           subtitle="Variation by Time of Day and Weather Type",
           x = "Hour of Day",
           y = "Number of Riders")

ggplot(day, aes(x = as.factor(month), y = casual, fill = as.factor(weather))) + 
  geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Casual Riders", 
           subtitle="Variation by Time of Day and Weather Type",
           x = "Month",
           y = "Number of Riders")

ggplot(day, aes(as.factor(month), y = registered, fill = as.factor(weather))) + geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Registered Riders", 
           subtitle="Variation by Time of Day and Weather Type",
           x = "Month",
           y = "Number of Riders")

(a). By comparing the boxplots we can see the number of casual riders (e.g. tourists) inreased from 2011 to 2012 although the median value for both years was not hugely different. However 2012 had more outlier days with a high number of riders.

The number of registered riders increased more significantly, implying that more people began to use the Capital Bikeshare service to commute around the city in place of public transport/private vehicles.

For both casual and registered riders there are a greater number of rides during the summer months, which makes sense as it is more comfortable to bike around in the city as compared to winter. The number of registered users is higher in winter as compared to casual riders, though this might just be because fewer tourists come to DC in the cold.

Finally we see that registered riders usually peak around office commuting hours (e.g. before 9am and after 5pm) which shows that a lot of these users are going to and from work. On the other hand for casual riders the peak is around late afternoon/evening when most tourists would be sightseeing etc.

(b). There seems to be a positive relationship between registered users and casual users, implying that other factors, such as weather seem to affect both similarly. On a clear ay more riders will be using the service, both casual and registered. The effect does not vary as much year, as both curves follow a similar trajectory.

However the effect does vary by working day.In case of a working day we see there tend to be more registered users, which shows that these people need to get to work whereas for casual users the increase is much gentler implying lesser urgency.

(c). There is a strong relationship between the weather situation and ridership counts. Biking seems to be an activity strongly associated with the weather conditions as many individuals choose not to bike when it is rainy or snowing. This holds true for both casual and registered riders. There is a steep drop in ridership from clear to cloudy, and there are hardly any riders during the snow/rain.

(d). We notice that ridership is almost zero if the weather is snowy/raining in the morning or late at night, for both casual and registered readers. However there are a few riders during the middle of the day, although it is larger for registered riders.

There is a slightly larger number of riders in the summer months for rainy/snowy weather as compared to the winter.

Problem 2

train1 <- sample(nrow(day),(nrow(day) * .70), replace = FALSE)
day_train <- day[train1,]
day_test <- day[-train1,]


# a

mlr_reg <- lm(registered ~ factor(season) + year + month + wday + hr + temp + atemp + hum + windspeed +
                factor(weather), data = day_train)
summary(mlr_reg)
## 
## Call:
## lm(formula = registered ~ factor(season) + year + month + wday + 
##     hr + temp + atemp + hum + windspeed + factor(weather), data = day_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -287.68  -77.97  -25.75   46.67  616.56 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.433e+05  4.523e+03 -31.693  < 2e-16 ***
## factor(season)2   1.568e+01  4.093e+00   3.830 0.000129 ***
## factor(season)3   6.751e+00  5.809e+00   1.162 0.245210    
## factor(season)4   5.276e+01  5.764e+00   9.153  < 2e-16 ***
## year              7.125e+01  2.248e+00  31.693  < 2e-16 ***
## month             4.720e-01  6.083e-01   0.776 0.437811    
## wday              4.028e+01  2.403e+00  16.763  < 2e-16 ***
## hr                6.370e+00  1.733e-01  36.750  < 2e-16 ***
## temp              1.314e+02  3.905e+01   3.364 0.000770 ***
## atemp             1.008e+02  4.220e+01   2.388 0.016933 *  
## hum              -1.231e+02  7.211e+00 -17.064  < 2e-16 ***
## windspeed         3.289e+01  9.992e+00   3.292 0.000998 ***
## factor(weather)2  9.198e+00  2.727e+00   3.373 0.000745 ***
## factor(weather)3 -2.832e+01  4.604e+00  -6.151 7.92e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123.1 on 12151 degrees of freedom
## Multiple R-squared:  0.3409, Adjusted R-squared:  0.3402 
## F-statistic: 483.5 on 13 and 12151 DF,  p-value: < 2.2e-16
# b

predicted_reg <- predict(mlr_reg, day_test, interval = "confidence")
mean((predicted_reg - day_test$registered)^2)
## [1] 15008.7
# c

mean(mlr_reg$residuals^2)
## [1] 15128.27
mlr_reg3 <- lm(registered ~ factor(season) + year + factor(month) + wday + hr + temp + atemp + hum + windspeed + factor(weather), data = day_train)
summary(mlr_reg3)
## 
## Call:
## lm(formula = registered ~ factor(season) + year + factor(month) + 
##     wday + hr + temp + atemp + hum + windspeed + factor(weather), 
##     data = day_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -277.50  -77.45  -25.79   45.93  616.36 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.414e+05  4.517e+03 -31.304  < 2e-16 ***
## factor(season)2   3.700e+01  7.051e+00   5.248 1.57e-07 ***
## factor(season)3   2.421e+01  8.312e+00   2.913 0.003588 ** 
## factor(season)4   6.347e+01  7.085e+00   8.957  < 2e-16 ***
## year              7.029e+01  2.245e+00  31.306  < 2e-16 ***
## factor(month)2   -4.139e+00  5.651e+00  -0.732 0.463919    
## factor(month)3   -1.368e+01  6.286e+00  -2.176 0.029576 *  
## factor(month)4   -3.480e+01  9.317e+00  -3.735 0.000189 ***
## factor(month)5   -2.717e+01  9.931e+00  -2.736 0.006236 ** 
## factor(month)6   -4.663e+01  1.011e+01  -4.613 4.01e-06 ***
## factor(month)7   -5.298e+01  1.141e+01  -4.644 3.45e-06 ***
## factor(month)8   -2.569e+01  1.117e+01  -2.300 0.021477 *  
## factor(month)9   -3.155e+00  1.001e+01  -0.315 0.752514    
## factor(month)10  -1.539e+01  9.335e+00  -1.648 0.099348 .  
## factor(month)11  -2.463e+01  8.998e+00  -2.737 0.006201 ** 
## factor(month)12  -5.527e+00  7.221e+00  -0.765 0.444007    
## wday              3.992e+01  2.401e+00  16.627  < 2e-16 ***
## hr                6.186e+00  1.748e-01  35.391  < 2e-16 ***
## temp              1.503e+02  4.080e+01   3.683 0.000231 ***
## atemp             1.118e+02  4.263e+01   2.623 0.008724 ** 
## hum              -1.313e+02  7.351e+00 -17.862  < 2e-16 ***
## windspeed         3.062e+01  1.005e+01   3.046 0.002328 ** 
## factor(weather)2  7.895e+00  2.730e+00   2.892 0.003835 ** 
## factor(weather)3 -2.813e+01  4.593e+00  -6.124 9.40e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122.6 on 12141 degrees of freedom
## Multiple R-squared:  0.3465, Adjusted R-squared:  0.3452 
## F-statistic: 279.8 on 23 and 12141 DF,  p-value: < 2.2e-16
# d

day_train2 <- day_train
day_test2 <- day_test

day_train2$hr <-  as.factor(day_train2$hr)
day_test2$hr <- as.factor(day_test2$hr)

mlr_reg2 <- lm(registered ~ factor(season) + year + month + wday + factor(hr) + temp + atemp + hum + windspeed + factor(weather), data = day_train)
summary(mlr_reg2)
## 
## Call:
## lm(formula = registered ~ factor(season) + year + month + wday + 
##     factor(hr) + temp + atemp + hum + windspeed + factor(weather), 
##     data = day_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -343.96  -48.94   -6.37   44.44  440.94 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.478e+05  3.166e+03 -46.690  < 2e-16 ***
## factor(season)2   3.103e+01  2.898e+00  10.708  < 2e-16 ***
## factor(season)3   3.224e+01  4.134e+00   7.800 6.69e-15 ***
## factor(season)4   6.184e+01  4.032e+00  15.338  < 2e-16 ***
## year              7.347e+01  1.574e+00  46.679  < 2e-16 ***
## month            -3.997e-01  4.253e-01  -0.940 0.347250    
## wday              4.166e+01  1.679e+00  24.808  < 2e-16 ***
## factor(hr)1      -8.209e+00  5.397e+00  -1.521 0.128272    
## factor(hr)2      -1.969e+01  5.474e+00  -3.598 0.000322 ***
## factor(hr)3      -2.720e+01  5.435e+00  -5.005 5.67e-07 ***
## factor(hr)4      -3.307e+01  5.390e+00  -6.135 8.76e-10 ***
## factor(hr)5      -1.603e+01  5.439e+00  -2.947 0.003213 ** 
## factor(hr)6       3.752e+01  5.424e+00   6.917 4.85e-12 ***
## factor(hr)7       1.696e+02  5.391e+00  31.457  < 2e-16 ***
## factor(hr)8       3.012e+02  5.370e+00  56.082  < 2e-16 ***
## factor(hr)9       1.451e+02  5.325e+00  27.253  < 2e-16 ***
## factor(hr)10      7.783e+01  5.422e+00  14.354  < 2e-16 ***
## factor(hr)11      9.655e+01  5.409e+00  17.852  < 2e-16 ***
## factor(hr)12      1.259e+02  5.536e+00  22.738  < 2e-16 ***
## factor(hr)13      1.216e+02  5.498e+00  22.123  < 2e-16 ***
## factor(hr)14      1.016e+02  5.443e+00  18.666  < 2e-16 ***
## factor(hr)15      1.144e+02  5.492e+00  20.832  < 2e-16 ***
## factor(hr)16      1.749e+02  5.483e+00  31.895  < 2e-16 ***
## factor(hr)17      3.293e+02  5.501e+00  59.871  < 2e-16 ***
## factor(hr)18      3.103e+02  5.464e+00  56.793  < 2e-16 ***
## factor(hr)19      2.104e+02  5.458e+00  38.543  < 2e-16 ***
## factor(hr)20      1.384e+02  5.383e+00  25.718  < 2e-16 ***
## factor(hr)21      9.823e+01  5.370e+00  18.293  < 2e-16 ***
## factor(hr)22      6.376e+01  5.363e+00  11.889  < 2e-16 ***
## factor(hr)23      3.285e+01  5.380e+00   6.105 1.06e-09 ***
## temp              9.284e+01  2.736e+01   3.394 0.000692 ***
## atemp             6.491e+01  2.949e+01   2.201 0.027766 *  
## hum              -4.561e+01  5.496e+00  -8.298  < 2e-16 ***
## windspeed        -1.266e+01  7.016e+00  -1.805 0.071147 .  
## factor(weather)2 -5.312e+00  1.936e+00  -2.744 0.006075 ** 
## factor(weather)3 -5.310e+01  3.275e+00 -16.214  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.96 on 12129 degrees of freedom
## Multiple R-squared:  0.679,  Adjusted R-squared:  0.6781 
## F-statistic: 733.1 on 35 and 12129 DF,  p-value: < 2.2e-16
predicted_reg2 <- predict(mlr_reg2, day_test, interval = "confidence")
mean((predicted_reg2 - day_test$registered)^2)
## [1] 7442.71

(a). From our regression output we see that the variables season (for categories 2 and 4), year, wday, hr, tmemp and hum are highly significant (p-value much lower than 0.001). Both categories for the weather variable are also significant at the 1% level along with windspeed . The atemp variable is significant at the 5% level.

This significance level implies that all of these variables have a beta coefficient which is significantly different from zero and we can interpret the effect of these coefficients. For example, the “1.370e+02” coefficient on the temp variable implies that a 1 unit increase in temperature is associated with an increase of 137 registered riders per hour holding all other variables constant.

We see that the the number of registered riders increases on a working day, if the year is 2012, if the temperature and windspeed increase. Similarly there is a negative relationship between registered riders and inreasing humidity, and if the weather is rainy or snowing.

These results make sense as registered riders are often those that use the bikes to commute to and from work, but might look at alternatives in case of adverse weather.

(b). The RMSE value is 15032.7.

(c).

(d). We see that Adj R squared value almost doubles if we include the hour variable as a factor instead of as a numeric. Howevere the test RMSE value also falls to 7448.248, which indicates the improvement is not just for the training data but also the test data.

This is because incluing the hr variable as numeric implies that there is an increasing relationship within our variable with hr 23 being greater than hr 22, which is not accurate. For time periods although one might occur later, there is no independant magnitude which would be greater for one category or another. This is why we should include the hr variable as factor.

Problem 3

library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(nnet)

nnet_MSE <- vector()

for(i in 1:10){
  reg_nnet1 <- nnet(registered/1000 ~ temp + atemp + hum + windspeed + factor(weather), data = day_train, size = (2*i), maxit = 1000, decay = 0.1)

  predict1_reg <- predict(reg_nnet1, day_test, type = "raw") * 1000
  nnet_MSE[i] <- mean((predict1_reg - day_test$registered)^2)
  
}
## # weights:  17
## initial  value 1675.853583 
## iter  10 value 268.599038
## iter  20 value 231.213600
## iter  30 value 229.899702
## iter  40 value 229.784455
## iter  50 value 229.769561
## iter  60 value 229.763914
## iter  70 value 229.032319
## iter  80 value 228.721037
## final  value 228.690161 
## converged
## # weights:  33
## initial  value 1795.626896 
## iter  10 value 351.614877
## iter  20 value 237.027525
## iter  30 value 231.120209
## iter  40 value 230.515734
## iter  50 value 229.416492
## iter  60 value 228.789027
## iter  70 value 228.629060
## iter  80 value 228.537359
## iter  90 value 228.475042
## iter 100 value 228.455153
## iter 110 value 228.439874
## iter 120 value 228.436346
## iter 130 value 228.432900
## final  value 228.432165 
## converged
## # weights:  49
## initial  value 2099.166275 
## iter  10 value 407.358969
## iter  20 value 235.320337
## iter  30 value 230.689236
## iter  40 value 229.291453
## iter  50 value 228.655584
## iter  60 value 228.488306
## iter  70 value 228.458036
## iter  80 value 228.437318
## iter  90 value 228.434733
## iter 100 value 228.432588
## iter 110 value 228.432067
## iter 120 value 228.428298
## iter 130 value 228.419965
## iter 140 value 228.414935
## iter 150 value 228.413211
## iter 160 value 228.412610
## final  value 228.412489 
## converged
## # weights:  65
## initial  value 2872.425309 
## iter  10 value 409.754883
## iter  20 value 238.143274
## iter  30 value 231.361011
## iter  40 value 229.084033
## iter  50 value 228.672595
## iter  60 value 228.548998
## iter  70 value 228.508985
## iter  80 value 228.492641
## iter  90 value 228.475269
## iter 100 value 228.470576
## iter 110 value 228.468222
## iter 120 value 228.466297
## iter 130 value 228.464163
## iter 140 value 228.458402
## iter 150 value 228.446421
## iter 160 value 228.436628
## iter 170 value 228.433387
## final  value 228.433185 
## converged
## # weights:  81
## initial  value 5310.234521 
## iter  10 value 602.528340
## iter  20 value 249.478824
## iter  30 value 234.018369
## iter  40 value 230.473177
## iter  50 value 229.807436
## iter  60 value 229.232351
## iter  70 value 228.894520
## iter  80 value 228.724806
## iter  90 value 228.615850
## iter 100 value 228.573740
## iter 110 value 228.546755
## iter 120 value 228.517395
## iter 130 value 228.496754
## iter 140 value 228.482446
## iter 150 value 228.472914
## iter 160 value 228.467285
## iter 170 value 228.462101
## iter 180 value 228.455453
## iter 190 value 228.430348
## iter 200 value 228.421417
## iter 210 value 228.412573
## iter 220 value 228.407194
## iter 230 value 228.404946
## iter 240 value 228.403798
## final  value 228.403776 
## converged
## # weights:  97
## initial  value 1831.686640 
## iter  10 value 555.106860
## iter  20 value 279.233461
## iter  30 value 239.790060
## iter  40 value 233.660189
## iter  50 value 231.627258
## iter  60 value 230.720342
## iter  70 value 230.271586
## iter  80 value 229.801900
## iter  90 value 229.394380
## iter 100 value 229.088192
## iter 110 value 228.936604
## iter 120 value 228.818361
## iter 130 value 228.717980
## iter 140 value 228.591154
## iter 150 value 228.498441
## iter 160 value 228.452300
## iter 170 value 228.433040
## iter 180 value 228.423703
## iter 190 value 228.419080
## iter 200 value 228.414043
## iter 210 value 228.402723
## iter 220 value 228.391760
## iter 230 value 228.388673
## iter 240 value 228.387435
## iter 250 value 228.386966
## final  value 228.386885 
## converged
## # weights:  113
## initial  value 969.872339 
## iter  10 value 448.144864
## iter  20 value 237.243674
## iter  30 value 230.607019
## iter  40 value 229.278978
## iter  50 value 228.893954
## iter  60 value 228.686854
## iter  70 value 228.534759
## iter  80 value 228.479707
## iter  90 value 228.458600
## iter 100 value 228.437948
## iter 110 value 228.409073
## iter 120 value 228.399894
## iter 130 value 228.393517
## iter 140 value 228.389318
## iter 150 value 228.387697
## iter 160 value 228.386907
## iter 170 value 228.386387
## iter 180 value 228.386208
## iter 190 value 228.385853
## iter 200 value 228.385559
## iter 210 value 228.385272
## iter 220 value 228.385063
## iter 230 value 228.384928
## final  value 228.384706 
## converged
## # weights:  129
## initial  value 624.675306 
## iter  10 value 319.304215
## iter  20 value 232.442131
## iter  30 value 229.284033
## iter  40 value 228.612203
## iter  50 value 228.447207
## iter  60 value 228.418532
## iter  70 value 228.404752
## iter  80 value 228.393565
## iter  90 value 228.389538
## iter 100 value 228.385678
## iter 110 value 228.383310
## iter 120 value 228.382176
## iter 130 value 228.381412
## iter 140 value 228.380536
## iter 150 value 228.380042
## iter 160 value 228.379853
## iter 170 value 228.379718
## iter 180 value 228.379582
## iter 190 value 228.379428
## iter 200 value 228.379317
## iter 210 value 228.379141
## final  value 228.379128 
## converged
## # weights:  145
## initial  value 5513.409489 
## iter  10 value 598.136681
## iter  20 value 247.071094
## iter  30 value 233.232057
## iter  40 value 230.113529
## iter  50 value 229.279411
## iter  60 value 228.847895
## iter  70 value 228.666974
## iter  80 value 228.552384
## iter  90 value 228.453861
## iter 100 value 228.429803
## iter 110 value 228.412610
## iter 120 value 228.402055
## iter 130 value 228.395951
## iter 140 value 228.390529
## iter 150 value 228.388155
## iter 160 value 228.386200
## iter 170 value 228.384100
## iter 180 value 228.382972
## iter 190 value 228.381984
## iter 200 value 228.381189
## iter 210 value 228.380831
## iter 220 value 228.380349
## iter 230 value 228.379244
## iter 240 value 228.378608
## iter 250 value 228.378111
## iter 260 value 228.377504
## iter 270 value 228.376991
## iter 280 value 228.376721
## iter 290 value 228.376631
## iter 300 value 228.376529
## iter 310 value 228.376317
## iter 320 value 228.375820
## final  value 228.375802 
## converged
## # weights:  161
## initial  value 2306.219567 
## iter  10 value 598.910138
## iter  20 value 259.437874
## iter  30 value 234.071219
## iter  40 value 230.926099
## iter  50 value 230.186905
## iter  60 value 229.773685
## iter  70 value 229.123673
## iter  80 value 228.959685
## iter  90 value 228.804903
## iter 100 value 228.734505
## iter 110 value 228.663298
## iter 120 value 228.627456
## iter 130 value 228.595155
## iter 140 value 228.553885
## iter 150 value 228.522855
## iter 160 value 228.496380
## iter 170 value 228.455721
## iter 180 value 228.429146
## iter 190 value 228.421320
## iter 200 value 228.415367
## iter 210 value 228.412032
## iter 220 value 228.408500
## iter 230 value 228.405030
## iter 240 value 228.401535
## iter 250 value 228.395979
## iter 260 value 228.392155
## iter 270 value 228.388526
## iter 280 value 228.386922
## iter 290 value 228.385177
## iter 300 value 228.384460
## iter 310 value 228.384100
## iter 320 value 228.383679
## iter 330 value 228.383407
## final  value 228.383288 
## converged
nnet_MSE
##  [1] 18264.38 18222.89 18219.15 18215.09 18213.65 18214.21 18213.63
##  [8] 18212.47 18211.64 18210.81
nnet_MSE2 <- vector()

for(i in 1:10){
  reg_nnet2 <- nnet(registered/1000 ~ factor(season) + year + factor(month) + wday + factor(hr), data = day_train, size = (2*i), maxit = 1000, decay = 0.1)

  predict2_reg <- predict(reg_nnet2, day_test, type ="raw") * 1000
  nnet_MSE2[i] <- mean((predict2_reg - day_test$registered)^2)
  
}
## # weights:  83
## initial  value 1939.542557 
## iter  10 value 591.693955
## iter  20 value 279.235340
## iter  30 value 265.290538
## iter  40 value 150.407739
## iter  50 value 107.298722
## iter  60 value 102.662679
## iter  70 value 101.786570
## iter  80 value 101.507340
## iter  90 value 101.465131
## iter 100 value 97.494341
## iter 110 value 82.157325
## iter 120 value 78.314104
## iter 130 value 77.537793
## iter 140 value 77.315956
## iter 150 value 77.128635
## iter 160 value 76.783126
## iter 170 value 76.673205
## iter 180 value 76.672201
## final  value 76.672191 
## converged
## # weights:  165
## initial  value 1302.124488 
## iter  10 value 302.747572
## iter  20 value 279.323194
## iter  30 value 279.269871
## iter  40 value 277.472565
## iter  50 value 261.106766
## iter  60 value 124.485937
## iter  70 value 108.912464
## iter  80 value 105.839188
## iter  90 value 102.421934
## iter 100 value 101.357195
## iter 110 value 101.317162
## iter 120 value 101.310825
## iter 130 value 101.310070
## iter 140 value 101.293202
## iter 150 value 101.059282
## iter 160 value 100.462739
## iter 170 value 97.028932
## iter 180 value 89.297669
## iter 190 value 81.349514
## iter 200 value 77.303062
## iter 210 value 76.328034
## iter 220 value 75.909786
## iter 230 value 75.529607
## iter 240 value 75.157328
## iter 250 value 74.863718
## iter 260 value 74.695440
## iter 270 value 74.599103
## iter 280 value 74.535897
## iter 290 value 74.504359
## iter 300 value 74.495457
## iter 310 value 74.491632
## iter 320 value 74.490761
## iter 330 value 74.490450
## iter 340 value 74.490124
## iter 350 value 74.489123
## iter 360 value 74.487224
## iter 370 value 74.486294
## iter 380 value 74.485882
## iter 390 value 74.485807
## final  value 74.485802 
## converged
## # weights:  247
## initial  value 1572.139122 
## iter  10 value 464.453853
## iter  20 value 279.377047
## iter  30 value 279.339578
## iter  40 value 271.457206
## iter  50 value 145.617987
## iter  60 value 117.544149
## iter  70 value 107.856451
## iter  80 value 104.340810
## iter  90 value 103.018315
## iter 100 value 100.824406
## iter 110 value 88.131605
## iter 120 value 80.393060
## iter 130 value 78.949832
## iter 140 value 77.816632
## iter 150 value 76.930536
## iter 160 value 76.695898
## iter 170 value 76.582831
## iter 180 value 76.298330
## iter 190 value 76.044900
## iter 200 value 75.976414
## iter 210 value 75.972840
## iter 220 value 75.972486
## iter 230 value 75.972402
## iter 240 value 75.972360
## final  value 75.972095 
## converged
## # weights:  329
## initial  value 2889.035097 
## iter  10 value 305.725490
## iter  20 value 279.276063
## iter  30 value 279.273827
## iter  40 value 268.736290
## iter  50 value 126.894475
## iter  60 value 113.185060
## iter  70 value 105.422161
## iter  80 value 101.027774
## iter  90 value 96.977445
## iter 100 value 92.117469
## iter 110 value 84.747706
## iter 120 value 80.441749
## iter 130 value 78.364698
## iter 140 value 77.198279
## iter 150 value 76.876970
## iter 160 value 76.585014
## iter 170 value 75.749262
## iter 180 value 75.253763
## iter 190 value 75.027030
## iter 200 value 74.900812
## iter 210 value 74.831110
## iter 220 value 74.737075
## iter 230 value 74.599880
## iter 240 value 74.431332
## iter 250 value 74.355942
## iter 260 value 74.320633
## iter 270 value 74.299944
## iter 280 value 74.291012
## iter 290 value 74.284789
## iter 300 value 74.280283
## iter 310 value 74.277607
## iter 320 value 74.270934
## iter 330 value 74.254071
## iter 340 value 74.242653
## iter 350 value 74.234506
## iter 360 value 74.225584
## iter 370 value 74.209666
## iter 380 value 74.195302
## iter 390 value 74.183845
## iter 400 value 74.170661
## iter 410 value 74.144997
## iter 420 value 74.094548
## iter 430 value 74.056502
## iter 440 value 74.038405
## iter 450 value 74.032448
## iter 460 value 74.030241
## iter 470 value 74.028787
## iter 480 value 74.027660
## iter 490 value 74.026896
## iter 500 value 74.025457
## iter 510 value 74.024232
## iter 520 value 74.023877
## iter 530 value 74.023804
## final  value 74.023795 
## converged
## # weights:  411
## initial  value 3642.416982 
## iter  10 value 620.759980
## iter  20 value 282.187633
## iter  30 value 279.317854
## iter  40 value 279.284241
## iter  50 value 279.255793
## iter  60 value 227.849145
## iter  70 value 185.171621
## iter  80 value 154.124452
## iter  90 value 120.894107
## iter 100 value 111.954712
## iter 110 value 108.216812
## iter 120 value 98.013405
## iter 130 value 96.088570
## iter 140 value 83.754969
## iter 150 value 78.740827
## iter 160 value 78.504604
## iter 170 value 76.762121
## iter 180 value 75.933718
## iter 190 value 75.587031
## iter 200 value 75.460884
## iter 210 value 75.290267
## iter 220 value 75.038743
## iter 230 value 74.857537
## iter 240 value 74.741840
## iter 250 value 74.680820
## iter 260 value 74.575207
## iter 270 value 74.523280
## iter 280 value 74.474206
## iter 290 value 74.444631
## iter 300 value 74.428090
## iter 310 value 74.417655
## iter 320 value 74.414468
## final  value 74.413924 
## converged
## # weights:  493
## initial  value 3918.185245 
## iter  10 value 486.028782
## iter  20 value 279.315025
## iter  30 value 279.259831
## iter  40 value 266.591781
## iter  50 value 141.136492
## iter  60 value 118.826191
## iter  70 value 111.363540
## iter  80 value 110.004602
## iter  90 value 107.282691
## iter 100 value 103.667818
## iter 110 value 103.122002
## iter 120 value 102.484526
## iter 130 value 101.440767
## iter 140 value 101.268829
## iter 150 value 101.210216
## iter 160 value 101.172827
## iter 170 value 101.148679
## iter 180 value 101.092514
## iter 190 value 101.048613
## iter 200 value 100.052470
## iter 210 value 99.119723
## iter 220 value 93.278212
## iter 230 value 83.405028
## iter 240 value 78.398126
## iter 250 value 76.796786
## iter 260 value 75.850506
## iter 270 value 75.569469
## iter 280 value 75.246808
## iter 290 value 74.813042
## iter 300 value 74.552092
## iter 310 value 74.273709
## iter 320 value 74.107693
## iter 330 value 73.984957
## iter 340 value 73.925573
## iter 350 value 73.891323
## iter 360 value 73.866210
## iter 370 value 73.850151
## iter 380 value 73.819813
## iter 390 value 73.787068
## iter 400 value 73.777305
## iter 410 value 73.772663
## iter 420 value 73.770963
## iter 430 value 73.769229
## iter 440 value 73.764571
## iter 450 value 73.759515
## iter 460 value 73.757367
## iter 470 value 73.755029
## iter 480 value 73.753332
## iter 490 value 73.752648
## iter 500 value 73.751550
## iter 510 value 73.748931
## iter 520 value 73.743085
## iter 530 value 73.740460
## iter 540 value 73.739033
## iter 550 value 73.737081
## iter 560 value 73.731067
## iter 570 value 73.726703
## iter 580 value 73.725166
## iter 590 value 73.723397
## iter 600 value 73.722063
## iter 610 value 73.721044
## iter 620 value 73.719210
## iter 630 value 73.716178
## iter 640 value 73.713317
## iter 650 value 73.712415
## iter 660 value 73.711968
## iter 670 value 73.711753
## iter 680 value 73.711536
## iter 690 value 73.711237
## iter 700 value 73.710904
## iter 710 value 73.710646
## iter 720 value 73.710457
## iter 730 value 73.710293
## iter 740 value 73.709635
## iter 750 value 73.708838
## iter 760 value 73.708461
## iter 770 value 73.708280
## iter 780 value 73.708205
## iter 790 value 73.708168
## final  value 73.708117 
## converged
## # weights:  575
## initial  value 856.830359 
## iter  10 value 303.242495
## iter  20 value 280.561889
## iter  30 value 279.030245
## iter  40 value 269.517519
## iter  50 value 269.032389
## iter  60 value 120.223734
## iter  70 value 95.973419
## iter  80 value 93.673604
## iter  90 value 90.080410
## iter 100 value 77.441527
## iter 110 value 76.626343
## iter 120 value 75.967978
## iter 130 value 75.346300
## iter 140 value 74.855825
## iter 150 value 74.815169
## iter 160 value 74.640302
## iter 170 value 74.627598
## iter 180 value 74.538465
## iter 190 value 74.374392
## iter 200 value 74.287632
## iter 210 value 74.181262
## iter 220 value 74.106834
## iter 230 value 74.042370
## iter 240 value 74.007708
## iter 250 value 73.981396
## iter 260 value 73.934788
## iter 270 value 73.880248
## iter 280 value 73.829476
## iter 290 value 73.796932
## iter 300 value 73.780359
## iter 310 value 73.757690
## iter 320 value 73.729119
## iter 330 value 73.718508
## iter 340 value 73.714161
## iter 350 value 73.708977
## iter 360 value 73.700103
## iter 370 value 73.685961
## iter 380 value 73.679062
## iter 390 value 73.676457
## iter 400 value 73.675413
## iter 410 value 73.674642
## iter 420 value 73.674139
## iter 430 value 73.673629
## iter 440 value 73.673197
## iter 450 value 73.672897
## iter 460 value 73.672458
## iter 470 value 73.671863
## iter 480 value 73.671464
## iter 490 value 73.671112
## iter 500 value 73.670765
## iter 510 value 73.670351
## iter 520 value 73.669855
## iter 530 value 73.669133
## iter 540 value 73.667231
## iter 550 value 73.666515
## iter 560 value 73.666113
## iter 570 value 73.665806
## iter 580 value 73.665567
## iter 590 value 73.665396
## iter 600 value 73.665129
## iter 610 value 73.664829
## iter 620 value 73.664648
## iter 630 value 73.664594
## iter 640 value 73.664558
## iter 650 value 73.664517
## iter 660 value 73.664423
## iter 670 value 73.664304
## iter 680 value 73.664262
## iter 690 value 73.664245
## iter 700 value 73.664231
## final  value 73.664222 
## converged
## # weights:  657
## initial  value 8213.431926 
## iter  10 value 1023.744300
## iter  20 value 312.717559
## iter  30 value 279.310931
## iter  40 value 278.662042
## iter  50 value 140.773082
## iter  60 value 104.261101
## iter  70 value 102.328922
## iter  80 value 101.616002
## iter  90 value 101.325750
## iter 100 value 101.270815
## iter 110 value 101.209601
## iter 120 value 101.177212
## iter 130 value 101.168997
## iter 140 value 101.167009
## iter 150 value 101.162859
## iter 160 value 101.157822
## iter 170 value 101.143740
## iter 180 value 100.342648
## iter 190 value 90.812854
## iter 200 value 83.593017
## iter 210 value 79.710753
## iter 220 value 77.390735
## iter 230 value 77.053964
## iter 240 value 76.524590
## iter 250 value 76.233476
## iter 260 value 75.882305
## iter 270 value 75.469594
## iter 280 value 75.321457
## iter 290 value 75.275697
## iter 300 value 75.161241
## iter 310 value 74.968066
## iter 320 value 74.892062
## iter 330 value 74.733082
## iter 340 value 74.687625
## iter 350 value 74.637374
## iter 360 value 74.520189
## iter 370 value 74.442273
## iter 380 value 74.354015
## iter 390 value 74.290729
## iter 400 value 74.231759
## iter 410 value 74.161868
## iter 420 value 74.111125
## iter 430 value 74.084695
## iter 440 value 74.073395
## iter 450 value 74.068042
## iter 460 value 74.063168
## iter 470 value 74.043799
## iter 480 value 73.956295
## iter 490 value 73.883420
## iter 500 value 73.855568
## iter 510 value 73.787222
## iter 520 value 73.754731
## iter 530 value 73.732003
## iter 540 value 73.718782
## iter 550 value 73.706727
## iter 560 value 73.701852
## iter 570 value 73.700531
## iter 580 value 73.700050
## iter 590 value 73.699918
## iter 600 value 73.699849
## iter 610 value 73.699639
## iter 620 value 73.699561
## final  value 73.699559 
## converged
## # weights:  739
## initial  value 5619.309319 
## iter  10 value 432.746052
## iter  20 value 279.257763
## iter  30 value 279.255350
## iter  40 value 279.206678
## iter  50 value 278.152539
## iter  60 value 273.417339
## iter  70 value 120.729187
## iter  80 value 104.208886
## iter  90 value 102.314156
## iter 100 value 101.819122
## iter 110 value 101.700410
## iter 120 value 101.342780
## iter 130 value 101.057719
## iter 140 value 100.826763
## iter 150 value 100.196549
## iter 160 value 98.731056
## iter 170 value 94.863541
## iter 180 value 87.505562
## iter 190 value 81.468478
## iter 200 value 78.318468
## iter 210 value 76.622601
## iter 220 value 75.799288
## iter 230 value 75.434286
## iter 240 value 75.085966
## iter 250 value 74.943407
## iter 260 value 74.857755
## iter 270 value 74.809663
## iter 280 value 74.773897
## iter 290 value 74.745526
## iter 300 value 74.685286
## iter 310 value 74.627948
## iter 320 value 74.575417
## iter 330 value 74.523726
## iter 340 value 74.486104
## iter 350 value 74.448150
## iter 360 value 74.417340
## iter 370 value 74.383648
## iter 380 value 74.354355
## iter 390 value 74.332541
## iter 400 value 74.315491
## iter 410 value 74.305315
## iter 420 value 74.298362
## iter 430 value 74.290933
## iter 440 value 74.283319
## iter 450 value 74.272259
## iter 460 value 74.248703
## iter 470 value 74.221653
## iter 480 value 74.190295
## iter 490 value 74.160033
## iter 500 value 74.129850
## iter 510 value 74.092338
## iter 520 value 74.068096
## iter 530 value 74.042616
## iter 540 value 74.004763
## iter 550 value 73.965977
## iter 560 value 73.938996
## iter 570 value 73.918877
## iter 580 value 73.901707
## iter 590 value 73.887226
## iter 600 value 73.875345
## iter 610 value 73.868081
## iter 620 value 73.856506
## iter 630 value 73.842278
## iter 640 value 73.837985
## iter 650 value 73.803026
## iter 660 value 73.759257
## iter 670 value 73.742592
## iter 680 value 73.741959
## final  value 73.741905 
## converged
## # weights:  821
## initial  value 354.911611 
## iter  10 value 279.296691
## iter  20 value 279.249313
## iter  30 value 279.239328
## iter  40 value 279.214994
## iter  50 value 279.097400
## iter  60 value 278.797605
## iter  70 value 277.322989
## iter  80 value 242.279349
## iter  90 value 156.952647
## iter 100 value 128.923047
## iter 110 value 116.278606
## iter 120 value 96.627689
## iter 130 value 93.515381
## iter 140 value 83.622621
## iter 150 value 80.496865
## iter 160 value 76.975480
## iter 170 value 76.490876
## iter 180 value 76.268197
## iter 190 value 75.940281
## iter 200 value 75.818712
## iter 210 value 75.758856
## iter 220 value 75.606596
## iter 230 value 75.571121
## iter 240 value 75.490892
## iter 250 value 75.393535
## iter 260 value 75.381588
## iter 270 value 75.353841
## iter 280 value 75.306527
## iter 290 value 75.271367
## iter 300 value 75.249236
## iter 310 value 75.241054
## iter 320 value 75.194880
## iter 330 value 75.145196
## iter 340 value 75.050598
## iter 350 value 74.962015
## iter 360 value 74.934005
## iter 370 value 74.856557
## iter 380 value 74.839257
## iter 390 value 74.789212
## iter 400 value 74.632425
## iter 410 value 74.396897
## iter 420 value 74.136796
## iter 430 value 73.992135
## iter 440 value 73.919512
## iter 450 value 73.859808
## iter 460 value 73.813955
## iter 470 value 73.783726
## iter 480 value 73.752009
## iter 490 value 73.741267
## iter 500 value 73.734490
## iter 510 value 73.729677
## iter 520 value 73.727028
## iter 530 value 73.725439
## iter 540 value 73.724424
## iter 550 value 73.723630
## iter 560 value 73.723109
## iter 570 value 73.722556
## iter 580 value 73.721678
## iter 590 value 73.719909
## iter 600 value 73.715969
## iter 610 value 73.713867
## iter 620 value 73.712862
## iter 630 value 73.712342
## iter 640 value 73.712070
## iter 650 value 73.711746
## iter 660 value 73.711234
## iter 670 value 73.710307
## iter 680 value 73.709223
## iter 690 value 73.708417
## iter 700 value 73.707798
## iter 710 value 73.707240
## iter 720 value 73.706742
## iter 730 value 73.706363
## iter 740 value 73.706035
## iter 750 value 73.705557
## iter 760 value 73.705074
## iter 770 value 73.704803
## iter 780 value 73.704689
## iter 790 value 73.704607
## iter 800 value 73.704468
## iter 810 value 73.704214
## iter 820 value 73.704035
## iter 830 value 73.703750
## iter 840 value 73.703392
## iter 850 value 73.702994
## iter 860 value 73.702825
## iter 870 value 73.702663
## iter 880 value 73.702501
## iter 890 value 73.702338
## iter 900 value 73.702137
## iter 910 value 73.701954
## iter 920 value 73.701647
## iter 930 value 73.701207
## iter 940 value 73.700854
## iter 950 value 73.700541
## iter 960 value 73.700301
## iter 970 value 73.700046
## iter 980 value 73.699853
## iter 990 value 73.699601
## iter1000 value 73.699388
## final  value 73.699388 
## stopped after 1000 iterations
nnet_MSE2
##  [1] 5504.759 5391.998 5577.465 5419.372 5498.963 5439.971 5416.667
##  [8] 5423.153 5421.526 5438.779
nnet_MSE3 <- vector()

for(i in 1:10){
  reg_nnet3 <- nnet(registered/1000 ~ year + wday + factor(weather) + atemp , data = day_train, size = (2*i), maxit = 1000, decay = 0.1)

  predict3_reg <- predict(reg_nnet3, day_test, type ="raw") * 1000
  nnet_MSE3[i] <- mean((predict3_reg - day_test$registered)^2)
  
}
## # weights:  15
## initial  value 2676.656939 
## iter  10 value 299.990070
## iter  20 value 279.561892
## iter  30 value 279.328711
## iter  40 value 279.041973
## iter  50 value 277.625684
## iter  60 value 265.331563
## iter  70 value 246.631694
## iter  80 value 244.427093
## iter  90 value 244.188558
## iter 100 value 244.139011
## iter 110 value 244.137610
## final  value 244.137599 
## converged
## # weights:  29
## initial  value 4201.146965 
## iter  10 value 571.770907
## iter  20 value 279.516002
## iter  30 value 278.913038
## iter  40 value 255.043617
## iter  50 value 245.121214
## iter  60 value 244.154609
## iter  70 value 244.138219
## iter  80 value 244.133787
## final  value 244.133710 
## converged
## # weights:  43
## initial  value 3900.442444 
## iter  10 value 403.818109
## iter  20 value 279.323836
## final  value 279.323745 
## converged
## # weights:  57
## initial  value 2082.027220 
## iter  10 value 471.384347
## iter  20 value 279.283500
## iter  30 value 279.230008
## iter  40 value 262.930911
## iter  50 value 246.153275
## iter  60 value 244.256800
## iter  70 value 244.156278
## iter  80 value 244.124262
## iter  90 value 244.123976
## iter 100 value 244.123479
## iter 110 value 244.123158
## iter 120 value 244.122488
## iter 130 value 244.122119
## iter 140 value 244.121690
## iter 150 value 244.121556
## final  value 244.121543 
## converged
## # weights:  71
## initial  value 1873.665514 
## iter  10 value 344.860743
## iter  20 value 280.430772
## iter  30 value 279.296567
## iter  40 value 279.266043
## iter  50 value 278.897444
## iter  60 value 269.745336
## iter  70 value 249.720006
## iter  80 value 244.637661
## iter  90 value 244.315855
## iter 100 value 244.164056
## iter 110 value 244.127910
## iter 120 value 244.122174
## iter 130 value 244.121995
## iter 140 value 244.121803
## iter 150 value 244.121657
## final  value 244.121543 
## converged
## # weights:  85
## initial  value 284.014497 
## iter  10 value 279.258674
## iter  20 value 279.065798
## iter  30 value 272.183273
## iter  40 value 248.121113
## iter  50 value 244.523989
## iter  60 value 244.283802
## iter  70 value 244.162299
## iter  80 value 244.152285
## iter  90 value 244.137724
## iter 100 value 244.134700
## iter 110 value 244.125928
## iter 120 value 244.120448
## iter 130 value 244.119017
## iter 140 value 244.118670
## iter 150 value 244.118454
## iter 160 value 244.118346
## final  value 244.118260 
## converged
## # weights:  99
## initial  value 2232.907473 
## iter  10 value 551.695703
## iter  20 value 279.257987
## iter  30 value 279.250800
## iter  40 value 279.169374
## iter  50 value 254.933700
## iter  60 value 245.790964
## iter  70 value 244.323619
## iter  80 value 244.176755
## iter  90 value 244.123613
## final  value 244.123292 
## converged
## # weights:  113
## initial  value 281.821425 
## iter  10 value 279.253244
## iter  20 value 279.091271
## iter  30 value 278.727077
## iter  40 value 259.414438
## iter  50 value 250.990656
## iter  60 value 246.184833
## iter  70 value 244.146826
## iter  80 value 244.138312
## iter  90 value 244.123032
## iter 100 value 244.122630
## final  value 244.122558 
## converged
## # weights:  127
## initial  value 1288.958759 
## iter  10 value 394.751135
## iter  20 value 281.576107
## iter  30 value 278.943562
## iter  40 value 257.573785
## iter  50 value 244.982233
## iter  60 value 244.475798
## iter  70 value 244.233752
## iter  80 value 244.128046
## iter  90 value 244.120746
## iter 100 value 244.120638
## iter 110 value 244.120348
## iter 120 value 244.120081
## iter 130 value 244.119759
## final  value 244.119661 
## converged
## # weights:  141
## initial  value 7675.880627 
## iter  10 value 949.855056
## iter  20 value 291.678038
## iter  30 value 279.393832
## iter  40 value 279.262601
## iter  50 value 279.190621
## iter  60 value 278.758746
## iter  70 value 268.910041
## iter  80 value 259.357362
## iter  90 value 248.272663
## iter 100 value 244.568478
## iter 110 value 244.246029
## iter 120 value 244.130022
## iter 130 value 244.120722
## iter 140 value 244.119848
## final  value 244.119744 
## converged
nnet_MSE3
##  [1] 19690.31 19690.10 22801.62 19689.10 19689.71 19688.81 19689.67
##  [8] 19689.34 19689.61 19689.63

I scale my variables as otherwise test output remains constant. I do this by dividing by registered users by 1000 to get it in the 0-1 range and then multiply my predicted values by 1000 later.

For the final neural network I chose the variables working day and year for the time variables, and atemp and weather for the temp variables. From previous questions we have seen these play an important role in predicted the total number of registered users at any given time.

Comparing out three models we see that the smallest MSE is for the model which used only time variables and the largest is for the model with mixed variables (the one I chose). This seems to imply that registered users are more versatile when it comes to changes in weather on a daily basis but plan their commute across the year so as to avoid biking in the winter.

Problem 4

library(leaps)

regfit.fwd <- regsubsets (registered ~ . - casual, data=day ,nvmax=10, method ="forward")
reg_fwd_sum <- summary(regfit.fwd)
reg_fwd_sum
## Subset selection object
## Call: regsubsets.formula(registered ~ . - casual, data = day, nvmax = 10, 
##     method = "forward")
## 10 Variables  (and intercept)
##           Forced in Forced out
## season        FALSE      FALSE
## year          FALSE      FALSE
## month         FALSE      FALSE
## wday          FALSE      FALSE
## hr            FALSE      FALSE
## temp          FALSE      FALSE
## atemp         FALSE      FALSE
## hum           FALSE      FALSE
## windspeed     FALSE      FALSE
## weather       FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
##           season year month wday hr  temp atemp hum windspeed weather
## 1  ( 1 )  " "    " "  " "   " "  "*" " "  " "   " " " "       " "    
## 2  ( 1 )  " "    " "  " "   " "  "*" "*"  " "   " " " "       " "    
## 3  ( 1 )  " "    "*"  " "   " "  "*" "*"  " "   " " " "       " "    
## 4  ( 1 )  " "    "*"  " "   " "  "*" "*"  " "   "*" " "       " "    
## 5  ( 1 )  "*"    "*"  " "   " "  "*" "*"  " "   "*" " "       " "    
## 6  ( 1 )  "*"    "*"  " "   "*"  "*" "*"  " "   "*" " "       " "    
## 7  ( 1 )  "*"    "*"  " "   "*"  "*" "*"  "*"   "*" " "       " "    
## 8  ( 1 )  "*"    "*"  " "   "*"  "*" "*"  "*"   "*" "*"       " "    
## 9  ( 1 )  "*"    "*"  " "   "*"  "*" "*"  "*"   "*" "*"       "*"    
## 10  ( 1 ) "*"    "*"  "*"   "*"  "*" "*"  "*"   "*" "*"       "*"
regfit.bwd <- regsubsets (registered ~ . - casual , data=day ,nvmax=10, method ="backward")
reg_bwd_sum <- summary(regfit.bwd)
reg_bwd_sum
## Subset selection object
## Call: regsubsets.formula(registered ~ . - casual, data = day, nvmax = 10, 
##     method = "backward")
## 10 Variables  (and intercept)
##           Forced in Forced out
## season        FALSE      FALSE
## year          FALSE      FALSE
## month         FALSE      FALSE
## wday          FALSE      FALSE
## hr            FALSE      FALSE
## temp          FALSE      FALSE
## atemp         FALSE      FALSE
## hum           FALSE      FALSE
## windspeed     FALSE      FALSE
## weather       FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
##           season year month wday hr  temp atemp hum windspeed weather
## 1  ( 1 )  " "    " "  " "   " "  "*" " "  " "   " " " "       " "    
## 2  ( 1 )  " "    " "  " "   " "  "*" " "  "*"   " " " "       " "    
## 3  ( 1 )  " "    "*"  " "   " "  "*" " "  "*"   " " " "       " "    
## 4  ( 1 )  " "    "*"  " "   " "  "*" " "  "*"   "*" " "       " "    
## 5  ( 1 )  "*"    "*"  " "   " "  "*" " "  "*"   "*" " "       " "    
## 6  ( 1 )  "*"    "*"  " "   "*"  "*" " "  "*"   "*" " "       " "    
## 7  ( 1 )  "*"    "*"  " "   "*"  "*" " "  "*"   "*" "*"       " "    
## 8  ( 1 )  "*"    "*"  " "   "*"  "*" " "  "*"   "*" "*"       "*"    
## 9  ( 1 )  "*"    "*"  " "   "*"  "*" "*"  "*"   "*" "*"       "*"    
## 10  ( 1 ) "*"    "*"  "*"   "*"  "*" "*"  "*"   "*" "*"       "*"
plot(reg_fwd_sum$rss ,xlab="Number of Variables with Forward Selection",ylab="RSS", type="l")

plot(reg_fwd_sum$adjr2 ,xlab="Number of Variables with Forward Selection", ylab="Adjusted RSq",type="l")

plot(reg_bwd_sum$rss ,xlab="Number of Variables with Backward Selection",ylab="RSS", type="l")

plot(reg_bwd_sum$adjr2 ,xlab="Number of Variables with Backward Selection", ylab="Adjusted RSq",type="l")

From our plots we see that the best model is for 6 variables with both forward and backward subset selection. The 6 variables in this model are the same for both models and include - season, year, wday, hr, atemp and hum.

Problem 5

# a

library(readr)

covtype_train <- read_csv("covtype_train.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_integer()
## )
## See spec(...) for full column specifications.
covtype_test <- read_csv("covtype_test.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_integer()
## )
## See spec(...) for full column specifications.
covtype_train$X1 <- NULL
covtype_test$X1 <- NULL

covtype_train$cover[covtype_train$cover == 2] <- 0
covtype_train$cover[covtype_train$cover == 3] <- 1

covtype_test$cover[covtype_test$cover == 2] <- 0
covtype_test$cover[covtype_test$cover == 3] <- 1

glm_cover <- glm(as.factor(cover) ~ ., data = covtype_train, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_cover)
## 
## Call:
## glm(formula = as.factor(cover) ~ ., family = "binomial", data = covtype_train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.408   0.000   0.000   0.000   2.847  
## 
## Coefficients: (4 not defined because of singularities)
##                Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)  -1.093e+12  4.232e+12 -2.580e-01  0.79626    
## elev         -1.640e-02  1.194e-03 -1.373e+01  < 2e-16 ***
## aspect        6.926e-04  1.071e-03  6.470e-01  0.51785    
## slope        -4.658e-02  3.672e-02 -1.269e+00  0.20462    
## h_dist_hydro  5.546e-03  8.090e-04  6.856e+00 7.08e-12 ***
## v_dist_hydro  9.493e-04  1.947e-03  4.880e-01  0.62581    
## h_dist_road   4.021e-04  1.349e-04  2.982e+00  0.00287 ** 
## hillshade_9  -6.072e-02  3.699e-02 -1.642e+00  0.10067    
## hillshade_12  6.440e-02  3.171e-02  2.031e+00  0.04226 *  
## hillshade_3  -6.445e-02  3.134e-02 -2.057e+00  0.03970 *  
## h_dist_fire   1.038e-04  1.487e-04  6.980e-01  0.48508    
## wild1        -2.384e+01  1.975e+04 -1.000e-03  0.99904    
## wild2        -4.504e+15  4.115e+06 -1.095e+09  < 2e-16 ***
## wild3        -5.379e-01  3.346e-01 -1.608e+00  0.10792    
## wild4                NA         NA         NA       NA    
## soil1         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil2         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil3         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil4         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil5         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil6         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil7         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil8         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil9         1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil10        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil11        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil12        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil13        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil14        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil15               NA         NA         NA       NA    
## soil16        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil17        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil18        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil19        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil20        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil21        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil22        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil23        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil24        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil25        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil26       -4.503e+15  4.232e+12 -1.064e+03  < 2e-16 ***
## soil27        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil28        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil29       -4.503e+15  4.232e+12 -1.064e+03  < 2e-16 ***
## soil30        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil31        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil32        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil33        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil34        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil35               NA         NA         NA       NA    
## soil36        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil37               NA         NA         NA       NA    
## soil38        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil39        1.093e+12  4.232e+12  2.580e-01  0.79626    
## soil40        1.093e+12  4.232e+12  2.580e-01  0.79626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6992.8  on 9999  degrees of freedom
## Residual deviance: 1109.5  on 9949  degrees of freedom
## AIC: 1211.5
## 
## Number of Fisher Scoring iterations: 25
cover_pred_train <- predict(glm_cover, covtype_train, type = "response")>0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(actual = covtype_train$cover, predicted_train = cover_pred_train)
##       predicted_train
## actual FALSE TRUE
##      0  8777  108
##      1   114 1001
cover_pred_test <- predict(glm_cover, covtype_test, type = "response")>0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(actual = covtype_test$cover, predicted_test = cover_pred_test)
##       predicted_test
## actual FALSE TRUE
##      0  8717  128
##      1   124 1031
# Using subset of significant predictors

glm_cover2 <- glm(as.factor(cover) ~ elev + h_dist_hydro + h_dist_road + hillshade_12 + hillshade_3 +
                    wild2 + soil26 + soil29, data = covtype_train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_cover2)
## 
## Call:
## glm(formula = as.factor(cover) ~ elev + h_dist_hydro + h_dist_road + 
##     hillshade_12 + hillshade_3 + wild2 + soil26 + soil29, family = "binomial", 
##     data = covtype_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2831  -0.0972  -0.0127   0.0000   3.2586  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.382e+01  1.775e+00  24.685  < 2e-16 ***
## elev         -2.135e-02  7.972e-04 -26.775  < 2e-16 ***
## h_dist_hydro  5.708e-03  4.303e-04  13.263  < 2e-16 ***
## h_dist_road   6.611e-04  8.764e-05   7.543 4.58e-14 ***
## hillshade_12  5.001e-02  4.176e-03  11.975  < 2e-16 ***
## hillshade_3  -1.867e-02  2.043e-03  -9.141  < 2e-16 ***
## wild2        -8.958e+00  9.211e+02  -0.010    0.992    
## soil26       -1.714e+01  1.985e+03  -0.009    0.993    
## soil29       -1.658e+01  2.954e+02  -0.056    0.955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6992.8  on 9999  degrees of freedom
## Residual deviance: 1807.5  on 9991  degrees of freedom
## AIC: 1825.5
## 
## Number of Fisher Scoring iterations: 19
cover_pred_train2 <- predict(glm_cover2, covtype_train, type = "response")>0.13
table(actual = covtype_train$cover, predicted_train = cover_pred_train2)
##       predicted_train
## actual FALSE TRUE
##      0  8332  553
##      1    73 1042
cover_pred_test2 <- predict(glm_cover2, covtype_test, type = "response")>0.13
table(actual = covtype_test$cover, predicted_test = cover_pred_test2)
##       predicted_test
## actual FALSE TRUE
##      0  8238  607
##      1    69 1086

I chose a threshold level of 0.13 and ran the logistic regression again on a subset of the significant predictods. This gives approximately the same sensitivity (6.55%) and specificity (6.22%) on the training data.

On the test data this same threshold gives us a sensitivity value of 5.97% and a specificity value of 6.86%.

Problem 6

library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# Downsample to decrease SVM computation amount

# downsamp <- seq(1, ncol(covtype_train), 4)
# covtype_train2 <- covtype_train[ downsamp]
# # covtype_train2$train.y <- mnist_train$train.y
# covtype_test2 <- covtype_test[ downsamp]
# # mnist_test2$test.y <- covtype_test$test.y

covtype_train2 <- covtype_train
covtype_test2 <- covtype_test

# Drop soil variables to make computation easier

covtype_train2 <- covtype_train[ -c(15:54) ]
covtype_test2 <- covtype_test2[ -c(15:54) ]

# Run SVM

which(apply(covtype_train2, 2, var) == 0)
## named integer(0)
which(apply(covtype_test2, 2, var) == 0)
## named integer(0)
covtype_train = subset(covtype_train, select = -c(soil15,soil35,soil37) )
covtype_test = subset(covtype_test, select = -c(soil15,soil35,soil37))

# covtype_train2$cover <- covtype_train$cover
# covtype_train2$cover <- covtype_train$cover

tune_cover_radial <- tune.svm(as.factor(cover) ~ ., data = covtype_train2, kernel = "radial", gamma = c(1, 0.1, 0.01, 0.001), cost = c(1, 0.1, 0.01))

summary(tune_cover_radial)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##    0.1    1
## 
## - best performance: 0.0263 
## 
## - Detailed performance results:
##    gamma cost  error  dispersion
## 1  1.000 1.00 0.0301 0.005858517
## 2  0.100 1.00 0.0263 0.005812821
## 3  0.010 1.00 0.0335 0.006948221
## 4  0.001 1.00 0.0531 0.008912538
## 5  1.000 0.10 0.1015 0.007677529
## 6  0.100 0.10 0.0363 0.008179242
## 7  0.010 0.10 0.0531 0.008912538
## 8  0.001 0.10 0.0531 0.008912538
## 9  1.000 0.01 0.1115 0.009360081
## 10 0.100 0.01 0.0629 0.008999383
## 11 0.010 0.01 0.0531 0.008912538
## 12 0.001 0.01 0.1115 0.009360081
cover_radial_fit <- svm(as.factor(cover) ~ ., data = covtype_train, kernel = "radial", gamma = 0.1, cost=1)

# Training Error

table(true = covtype_train$cover, predicted = cover_radial_fit$fitted)
##     predicted
## true    0    1
##    0 8809   76
##    1   89 1026
# Test Error

pred_cover_radial_fit <- predict(cover_radial_fit , covtype_test)

table(true=covtype_train$cover, predicted = pred_cover_radial_fit)
##     predicted
## true    0    1
##    0 7863 1022
##    1  988  127

My computer was unable to run the SVM model for the whole dataset so I dropped the soil variables as they seem to have relatively little predictive power from our logistic regression in part 5. I completed the tuning process and then ran the best SVM with the full dataset.

The misclassification rate is 1.65% for the training data, but the test data misclassification rate is 20.1%. This indicates our SVM is overfitting the data and does not perform well on untrained data.

Problem 7

library(tree)
## Warning: package 'tree' was built under R version 3.4.4
# attach(covtype_test)
# Cover <- ifelse(Purchase == "MM",1,0)

tree.cover <- tree(as.factor(cover) ~ ., data = covtype_train)
summary(tree.cover)
## 
## Classification tree:
## tree(formula = as.factor(cover) ~ ., data = covtype_train)
## Variables actually used in tree construction:
## [1] "elev"  "wild1" "soil4" "soil2"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.153 = 1529 / 9991 
## Misclassification error rate: 0.0248 = 248 / 10000
plot(tree.cover)
text(tree.cover ,pretty =0)

tree.cover.pred <- predict(tree.cover, covtype_test, type="class")
table(actual = covtype_test$cover, predicted = tree.cover.pred)
##       predicted
## actual    0    1
##      0 8741  104
##      1  191  964
# CV and Pruning

set.seed (3)
cv.tree.cover <- cv.tree(tree.cover ,FUN=prune.misclass )
cv.tree.cover
## $size
## [1] 9 8 6 5 3 2 1
## 
## $dev
## [1] 289 289 316 371 410 698 933
## 
## $k
## [1]  -Inf   0.0  15.5  43.0  58.5 280.0 396.0
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
prune.tree.cover <- prune.misclass(tree.cover, best=8)
plot(prune.tree.cover)
text(prune.tree.cover, pretty=0)

tree.cover.pred2 <- predict(prune.tree.cover, covtype_test, type="class")
table(actual = covtype_test$cover, predicted = tree.cover.pred2)
##       predicted
## actual    0    1
##      0 8741  104
##      1  191  964

Classification error rate for pruned tree = (104 + 191 / 10000) = 2.95%

Problem 8

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
randForest.cover <- randomForest(as.factor(cover) ~ . , data = covtype_train, mtry= 7 , ntree=20)
randForest.cover
## 
## Call:
##  randomForest(formula = as.factor(cover) ~ ., data = covtype_train,      mtry = 7, ntree = 20) 
##                Type of random forest: classification
##                      Number of trees: 20
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 2.07%
## Confusion matrix:
##      0    1 class.error
## 0 8785   99  0.01114363
## 1  108 1007  0.09686099
importance(randForest.cover)
##              MeanDecreaseGini
## elev             3.849620e+02
## aspect           3.927054e+01
## slope            6.127884e+01
## h_dist_hydro     3.164131e+01
## v_dist_hydro     3.047529e+01
## h_dist_road      9.649719e+01
## hillshade_9      4.431769e+01
## hillshade_12     4.679115e+01
## hillshade_3      4.207037e+01
## h_dist_fire      7.276937e+01
## wild1            7.462885e+01
## wild2            7.962385e-01
## wild3            4.575683e+01
## wild4            3.655203e+02
## soil1            7.060504e+00
## soil2            1.370932e+02
## soil3            1.649194e+01
## soil4            1.332726e+02
## soil5            5.243146e+00
## soil6            4.317663e+01
## soil7            0.000000e+00
## soil8            0.000000e+00
## soil9            0.000000e+00
## soil10           5.912911e+01
## soil11           5.972824e+00
## soil12           8.267943e-01
## soil13           7.703536e+00
## soil14           4.285099e-01
## soil16           5.239645e-03
## soil17           8.606445e-01
## soil18           7.043711e-01
## soil19           4.306418e-02
## soil20           6.493282e-02
## soil21           0.000000e+00
## soil22           3.476340e+00
## soil23           4.797876e+00
## soil24           2.642385e+00
## soil25           0.000000e+00
## soil26           3.703704e-04
## soil27           1.549107e-02
## soil28           1.026579e+00
## soil29           4.885890e+00
## soil30           7.244359e-01
## soil31           2.022117e+00
## soil32           1.000551e+01
## soil33           1.818116e+01
## soil34           9.415283e-02
## soil36           0.000000e+00
## soil38           2.539277e-04
## soil39           1.735983e-01
## soil40           0.000000e+00
yhat.randC = predict(randForest.cover ,newdata = covtype_test)
table(actual = covtype_test$cover, predicted = yhat.randC)
##       predicted
## actual    0    1
##      0 8732  113
##      1  109 1046
# Important 10 variable model

randForest.cover2 <- randomForest(as.factor(cover) ~ elev + aspect + slope + h_dist_hydro + v_dist_hydro + h_dist_road + hillshade_9 + hillshade_12 + hillshade_3 + h_dist_fire , data = covtype_train, mtry= 3 , ntree=20)
randForest.cover2
## 
## Call:
##  randomForest(formula = as.factor(cover) ~ elev + aspect + slope +      h_dist_hydro + v_dist_hydro + h_dist_road + hillshade_9 +      hillshade_12 + hillshade_3 + h_dist_fire, data = covtype_train,      mtry = 3, ntree = 20) 
##                Type of random forest: classification
##                      Number of trees: 20
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 2.7%
## Confusion matrix:
##      0   1 class.error
## 0 8788  95  0.01069459
## 1  175 940  0.15695067
yhat.randC2 = predict(randForest.cover2 ,newdata = covtype_test)
table(actual = covtype_test$cover, predicted = yhat.randC2)
##       predicted
## actual    0    1
##      0 8752   93
##      1  187  968

Note: For a random forest classification problem we use mtry = sqrt(p).

From our test data we see that the misclassification rate is 2.95% for random forest with 10 most important variables and 2.85% for the full model random forest, implying that the latter performs better even on untrained data.

Problem 9

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load("mnist_all.RData")

df_test <- as.data.frame(test$x)
df_test$Digit <- test$y

set.seed(2)
Q9_sample <- sample(1:nrow(df_test), 1000)

df_test_q9 <- df_test[Q9_sample,]

hc.complete <- hclust(dist(df_test_q9), method="complete")
plot(hc.complete, main="Complete Linkage" , xlab="", sub="",cex =.9)

df_test_q9$ten_clus <- cutree(hc.complete, 10)
df_test_q9$ten_clus2 <- cutree(hc.complete, 5)
df_test_q9$ten_clus3 <- cutree(hc.complete, 15)

# Cross Tab Digit and Cluster Values

table(Digit = df_test_q9$Digit, Cluster = df_test_q9$ten_clus)
##      Cluster
## Digit  1  2  3  4  5  6  7  8  9 10
##     0 15  8  0  1  0  0 42  0  0 31
##     1 54 43  0  2  0  0  0  0  0  0
##     2 12  4 12 76  0  0  1  0  2  0
##     3  5 55 23  1  1 10  8  0  6  0
##     4 51 43  0  0 12  0  0  0  0  0
##     5 27 31  5  0  1 16  3  0  1  0
##     6 14  9  0 39  1  0  0 28  0  0
##     7 62 12  0  1 45  0  1  0  0  0
##     8 38 37  1  1  0  3  1  0  5  0
##     9 40 44  0  0 15  0  1  0  0  0
table(Digit = df_test_q9$Digit, Cluster = df_test_q9$ten_clus2)
##      Cluster
## Digit  1  2  3  4  5
##     0 15  8  1 42 31
##     1 54 43  2  0  0
##     2 26  4 76  1  0
##     3 34 56  1 18  0
##     4 51 55  0  0  0
##     5 33 32  0 19  0
##     6 14 10 67  0  0
##     7 62 57  1  1  0
##     8 44 37  1  4  0
##     9 40 59  0  1  0
table(Digit = df_test_q9$Digit, Cluster = df_test_q9$ten_clus3)
##      Cluster
## Digit  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
##     0  4  1  0  0  1  0  7  0  7 35  0  0  0 11 31
##     1 54 43  0  1  1  0  0  0  0  0  0  0  0  0  0
##     2 12  4 12 40 20  0  0  0  0  1 16  0  2  0  0
##     3  5 55 23  1  0  1  0 10  0  8  0  0  6  0  0
##     4 51 16  0  0  0 12 27  0  0  0  0  0  0  0  0
##     5 22 30  5  0  0  1  1 16  0  3  0  0  1  5  0
##     6 12  3  0  0 39  1  6  0  0  0  0 28  0  2  0
##     7 62 11  0  0  0 45  1  0  0  1  1  0  0  0  0
##     8 38 34  1  1  0  0  3  3  0  1  0  0  5  0  0
##     9 40 34  0  0  0 15 10  0  0  1  0  0  0  0  0
# Split dataset by cluster

# split(df_test_q9, df_test_q9$ten_clus)

# Calculate mean value for each column by cluster

cluster_means <- matrix(nrow = 10, ncol = 788)

cluster_means <- df_test_q9 %>% 
    group_by(ten_clus) %>%
    summarise_all("mean")

cluster_means <- as.data.frame(cluster_means)

# Create function to plot digits

plot_digit <- function(j){
  arr784 <- as.numeric(cluster_means[j,1:784])
  col=gray(12:1/12)
  image(matrix(arr784, nrow=28)[,28:1], col=col, 
        main = paste("this is from cluster ",cluster_means$labels[j]))
}

for(i in 1:10){
  plot_digit(i)
}

(b). We can cut the dendogram at the a height of around 3500 and then we would have approximately 10 clusters, which is what we know our data has. However if we look at the leaves we see that these numbers of these seem to be overlapping and they do not seem to be seperated correctly if we were to add a cut at height = 3500. In this case even if we have ten clusters there will be a large number of misclassifications. For this reason I feel that the dendogram does not have compelling evidence about the correct number of “clusters”.

(c).I used a cross-tabulation to see the most common digit for each cluster and see which digits tend to clump together. Using a hierarchical clustering of k = 10, we can see some clusters contain only one digit (such as digits 0 and 6), but no digit is perfectly seperated into its own cluster. Varying the level of k still leads to imperfect clustering, with cluster 1 conating the most variation in terms of digits.

I choose to use 10 clusters and then split the data by each cluster. After this I calculate the mean of each variable by the cluster to see what the average value of that pixel is for the cluster before plotting it.

Problem 10

library(ggfortify)
## Warning: package 'ggfortify' was built under R version 3.4.4
which(apply(df_test, 2, var) == 0)
##   V1   V2   V3   V4   V5   V6   V7   V8   V9  V10  V11  V12  V13  V14  V15 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27  V28  V29  V30 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  V31  V32  V33  V50  V51  V52  V53  V54  V55  V56  V57  V58  V59  V60  V61 
##   31   32   33   50   51   52   53   54   55   56   57   58   59   60   61 
##  V82  V83  V84  V85  V86  V87  V88 V112 V113 V114 V115 V140 V141 V168 V169 
##   82   83   84   85   86   87   88  112  113  114  115  140  141  168  169 
## V170 V197 V225 V337 V364 V365 V392 V393 V394 V420 V421 V422 V449 V477 V478 
##  170  197  225  337  364  365  392  393  394  420  421  422  449  477  478 
## V505 V533 V561 V588 V589 V590 V616 V617 V618 V644 V645 V672 V673 V674 V675 
##  505  533  561  588  589  590  616  617  618  644  645  672  673  674  675 
## V699 V700 V701 V702 V703 V727 V728 V729 V730 V731 V732 V754 V755 V756 V757 
##  699  700  701  702  703  727  728  729  730  731  732  754  755  756  757 
## V758 V759 V760 V761 V762 V779 V780 V781 V782 V783 V784 
##  758  759  760  761  762  779  780  781  782  783  784
df_test <- df_test[ - as.numeric(which(apply(df_test, 2, var) == 0))]

# a


pr.mnist <- prcomp(df_test, scale=TRUE)

pr.var <- pr.mnist$sdev ^2

pve <- pr.var/sum(pr.var)
pve
##   [1] 6.199824e-02 4.261971e-02 4.041900e-02 3.226216e-02 2.771465e-02
##   [6] 2.408527e-02 2.044219e-02 1.880442e-02 1.670122e-02 1.515360e-02
##  [11] 1.470044e-02 1.310275e-02 1.253967e-02 1.192863e-02 1.143425e-02
##  [16] 1.093475e-02 1.031490e-02 9.927439e-03 9.457467e-03 9.102562e-03
##  [21] 8.882116e-03 8.659025e-03 8.467488e-03 8.244449e-03 7.854211e-03
##  [26] 7.708808e-03 7.557945e-03 7.325687e-03 7.078694e-03 6.824527e-03
##  [31] 6.677280e-03 6.532530e-03 6.462752e-03 6.212479e-03 6.034448e-03
##  [36] 5.917179e-03 5.812251e-03 5.723904e-03 5.648266e-03 5.521146e-03
##  [41] 5.510552e-03 5.472601e-03 5.331640e-03 5.191785e-03 5.125071e-03
##  [46] 4.986125e-03 4.978993e-03 4.884685e-03 4.781582e-03 4.691985e-03
##  [51] 4.607179e-03 4.538054e-03 4.452305e-03 4.396033e-03 4.333757e-03
##  [56] 4.231124e-03 4.189762e-03 4.137308e-03 4.110474e-03 4.050560e-03
##  [61] 4.001875e-03 3.859565e-03 3.812457e-03 3.748760e-03 3.659602e-03
##  [66] 3.605534e-03 3.567652e-03 3.499439e-03 3.464697e-03 3.436632e-03
##  [71] 3.353172e-03 3.333730e-03 3.297193e-03 3.265675e-03 3.236521e-03
##  [76] 3.184420e-03 3.141815e-03 3.089541e-03 3.075042e-03 3.012812e-03
##  [81] 2.976570e-03 2.908319e-03 2.870665e-03 2.843684e-03 2.808937e-03
##  [86] 2.748883e-03 2.720344e-03 2.689560e-03 2.664859e-03 2.630816e-03
##  [91] 2.585594e-03 2.541446e-03 2.519489e-03 2.479986e-03 2.440511e-03
##  [96] 2.427887e-03 2.378737e-03 2.346959e-03 2.313464e-03 2.306443e-03
## [101] 2.281961e-03 2.263603e-03 2.228423e-03 2.213308e-03 2.156639e-03
## [106] 2.134596e-03 2.118705e-03 2.083810e-03 2.076818e-03 2.044565e-03
## [111] 2.042761e-03 2.025927e-03 1.968989e-03 1.958587e-03 1.941742e-03
## [116] 1.926854e-03 1.895644e-03 1.878429e-03 1.823603e-03 1.787200e-03
## [121] 1.776795e-03 1.724767e-03 1.714638e-03 1.682682e-03 1.668393e-03
## [126] 1.656691e-03 1.645439e-03 1.627467e-03 1.601676e-03 1.595241e-03
## [131] 1.584143e-03 1.561710e-03 1.553411e-03 1.533416e-03 1.522532e-03
## [136] 1.499068e-03 1.493024e-03 1.491014e-03 1.468708e-03 1.460821e-03
## [141] 1.450795e-03 1.436661e-03 1.409213e-03 1.394010e-03 1.377586e-03
## [146] 1.375125e-03 1.360772e-03 1.354752e-03 1.337095e-03 1.320273e-03
## [151] 1.302467e-03 1.295328e-03 1.290134e-03 1.281180e-03 1.270603e-03
## [156] 1.250429e-03 1.237924e-03 1.217015e-03 1.205446e-03 1.179075e-03
## [161] 1.170610e-03 1.160620e-03 1.151563e-03 1.134708e-03 1.124373e-03
## [166] 1.118873e-03 1.109930e-03 1.083950e-03 1.076432e-03 1.073541e-03
## [171] 1.063906e-03 1.057469e-03 1.034596e-03 1.022580e-03 1.012353e-03
## [176] 9.893284e-04 9.825657e-04 9.768988e-04 9.694012e-04 9.667898e-04
## [181] 9.522660e-04 9.419396e-04 9.353950e-04 9.248052e-04 9.213565e-04
## [186] 9.157441e-04 9.018511e-04 8.941395e-04 8.805470e-04 8.751303e-04
## [191] 8.679617e-04 8.528464e-04 8.480460e-04 8.445352e-04 8.331361e-04
## [196] 8.283672e-04 8.178630e-04 8.082486e-04 7.940563e-04 7.859470e-04
## [201] 7.844300e-04 7.790344e-04 7.681772e-04 7.600104e-04 7.572933e-04
## [206] 7.483803e-04 7.361123e-04 7.314506e-04 7.261947e-04 7.175517e-04
## [211] 7.108677e-04 7.023430e-04 6.977069e-04 6.837937e-04 6.797393e-04
## [216] 6.767524e-04 6.689022e-04 6.657317e-04 6.578015e-04 6.466329e-04
## [221] 6.401814e-04 6.395852e-04 6.332832e-04 6.283632e-04 6.227158e-04
## [226] 6.209587e-04 6.084430e-04 6.052650e-04 5.983929e-04 5.966517e-04
## [231] 5.905252e-04 5.859358e-04 5.787889e-04 5.751876e-04 5.729030e-04
## [236] 5.692548e-04 5.601560e-04 5.513799e-04 5.464380e-04 5.398128e-04
## [241] 5.364946e-04 5.329697e-04 5.310695e-04 5.244791e-04 5.220754e-04
## [246] 5.164440e-04 5.132614e-04 5.057191e-04 5.009332e-04 4.964596e-04
## [251] 4.926785e-04 4.851165e-04 4.827462e-04 4.785159e-04 4.698508e-04
## [256] 4.658311e-04 4.612191e-04 4.609344e-04 4.568334e-04 4.539229e-04
## [261] 4.520027e-04 4.478689e-04 4.425840e-04 4.393612e-04 4.361667e-04
## [266] 4.314865e-04 4.267199e-04 4.235610e-04 4.195761e-04 4.163677e-04
## [271] 4.133548e-04 4.070288e-04 4.049531e-04 4.015810e-04 3.981422e-04
## [276] 3.960722e-04 3.924997e-04 3.909682e-04 3.889275e-04 3.848352e-04
## [281] 3.817982e-04 3.794734e-04 3.729054e-04 3.676626e-04 3.666450e-04
## [286] 3.635195e-04 3.587359e-04 3.575298e-04 3.548252e-04 3.518989e-04
## [291] 3.478298e-04 3.438942e-04 3.419607e-04 3.392417e-04 3.389817e-04
## [296] 3.371644e-04 3.337319e-04 3.313944e-04 3.261888e-04 3.236395e-04
## [301] 3.212991e-04 3.186003e-04 3.141867e-04 3.128938e-04 3.116658e-04
## [306] 3.076821e-04 3.051521e-04 3.027696e-04 3.026339e-04 3.002148e-04
## [311] 2.975710e-04 2.925877e-04 2.911952e-04 2.893052e-04 2.871546e-04
## [316] 2.856044e-04 2.843520e-04 2.815630e-04 2.795801e-04 2.786229e-04
## [321] 2.756490e-04 2.735090e-04 2.698462e-04 2.679113e-04 2.673264e-04
## [326] 2.652687e-04 2.637343e-04 2.604485e-04 2.579550e-04 2.557514e-04
## [331] 2.542842e-04 2.507213e-04 2.488205e-04 2.464813e-04 2.444966e-04
## [336] 2.428019e-04 2.408209e-04 2.383759e-04 2.371553e-04 2.351999e-04
## [341] 2.333574e-04 2.319711e-04 2.312510e-04 2.276413e-04 2.258016e-04
## [346] 2.253496e-04 2.238731e-04 2.226528e-04 2.207859e-04 2.187684e-04
## [351] 2.170464e-04 2.164116e-04 2.148670e-04 2.131621e-04 2.125376e-04
## [356] 2.103897e-04 2.091137e-04 2.084667e-04 2.059472e-04 2.052837e-04
## [361] 2.030333e-04 2.019322e-04 1.991480e-04 1.985217e-04 1.972981e-04
## [366] 1.968001e-04 1.962477e-04 1.951256e-04 1.942610e-04 1.919671e-04
## [371] 1.901847e-04 1.896575e-04 1.879231e-04 1.873754e-04 1.860726e-04
## [376] 1.836784e-04 1.827440e-04 1.803624e-04 1.797033e-04 1.788469e-04
## [381] 1.780505e-04 1.774885e-04 1.765747e-04 1.740464e-04 1.729008e-04
## [386] 1.720882e-04 1.706243e-04 1.688837e-04 1.683369e-04 1.673550e-04
## [391] 1.663364e-04 1.644107e-04 1.638130e-04 1.628263e-04 1.620527e-04
## [396] 1.612499e-04 1.599876e-04 1.594671e-04 1.580445e-04 1.566543e-04
## [401] 1.560032e-04 1.554623e-04 1.546523e-04 1.541701e-04 1.519165e-04
## [406] 1.516484e-04 1.502332e-04 1.501497e-04 1.488084e-04 1.468952e-04
## [411] 1.457049e-04 1.452348e-04 1.446686e-04 1.432294e-04 1.425745e-04
## [416] 1.423528e-04 1.412281e-04 1.404112e-04 1.398222e-04 1.384594e-04
## [421] 1.383049e-04 1.367253e-04 1.365165e-04 1.354788e-04 1.346013e-04
## [426] 1.336797e-04 1.327818e-04 1.323492e-04 1.316456e-04 1.310451e-04
## [431] 1.307308e-04 1.292952e-04 1.287709e-04 1.279019e-04 1.274838e-04
## [436] 1.260758e-04 1.253907e-04 1.243856e-04 1.238101e-04 1.235593e-04
## [441] 1.229834e-04 1.226126e-04 1.215330e-04 1.207261e-04 1.201679e-04
## [446] 1.196728e-04 1.194362e-04 1.184591e-04 1.177226e-04 1.170609e-04
## [451] 1.158616e-04 1.154538e-04 1.146241e-04 1.135415e-04 1.132472e-04
## [456] 1.127118e-04 1.120691e-04 1.115591e-04 1.109031e-04 1.098425e-04
## [461] 1.091245e-04 1.089466e-04 1.083442e-04 1.078045e-04 1.074078e-04
## [466] 1.070459e-04 1.066884e-04 1.057040e-04 1.054539e-04 1.045120e-04
## [471] 1.036630e-04 1.032192e-04 1.027397e-04 1.021103e-04 1.017783e-04
## [476] 1.006155e-04 9.980013e-05 9.977945e-05 9.893263e-05 9.857210e-05
## [481] 9.761377e-05 9.723756e-05 9.657834e-05 9.623146e-05 9.564770e-05
## [486] 9.528142e-05 9.441901e-05 9.370821e-05 9.315463e-05 9.253826e-05
## [491] 9.240993e-05 9.200704e-05 9.189691e-05 9.087484e-05 9.002478e-05
## [496] 8.995657e-05 8.933572e-05 8.910549e-05 8.820004e-05 8.775689e-05
## [501] 8.743403e-05 8.737725e-05 8.700623e-05 8.612892e-05 8.542730e-05
## [506] 8.515577e-05 8.432578e-05 8.414354e-05 8.388064e-05 8.337563e-05
## [511] 8.260967e-05 8.197271e-05 8.148338e-05 8.115102e-05 8.101425e-05
## [516] 8.022001e-05 7.996207e-05 7.924425e-05 7.903869e-05 7.794296e-05
## [521] 7.785353e-05 7.665284e-05 7.638945e-05 7.616298e-05 7.598042e-05
## [526] 7.536193e-05 7.514461e-05 7.479517e-05 7.378056e-05 7.341338e-05
## [531] 7.322906e-05 7.284149e-05 7.216421e-05 7.182428e-05 7.148795e-05
## [536] 7.107164e-05 7.061075e-05 7.014635e-05 6.965115e-05 6.959718e-05
## [541] 6.933013e-05 6.921734e-05 6.851377e-05 6.839150e-05 6.775969e-05
## [546] 6.700042e-05 6.639283e-05 6.618965e-05 6.563734e-05 6.535309e-05
## [551] 6.504091e-05 6.446289e-05 6.416963e-05 6.390340e-05 6.316640e-05
## [556] 6.287200e-05 6.265388e-05 6.259333e-05 6.233603e-05 6.201612e-05
## [561] 6.126866e-05 6.036222e-05 6.017145e-05 5.993738e-05 5.965068e-05
## [566] 5.930204e-05 5.889339e-05 5.862538e-05 5.852972e-05 5.781498e-05
## [571] 5.754479e-05 5.732929e-05 5.632986e-05 5.621896e-05 5.592795e-05
## [576] 5.550705e-05 5.483720e-05 5.457221e-05 5.430270e-05 5.420401e-05
## [581] 5.393409e-05 5.364745e-05 5.318881e-05 5.276162e-05 5.235915e-05
## [586] 5.226991e-05 5.180868e-05 5.153274e-05 5.078186e-05 5.062402e-05
## [591] 5.018914e-05 4.997664e-05 4.953201e-05 4.910308e-05 4.901593e-05
## [596] 4.853694e-05 4.814239e-05 4.779870e-05 4.757900e-05 4.720262e-05
## [601] 4.702191e-05 4.648818e-05 4.624978e-05 4.597969e-05 4.540359e-05
## [606] 4.522464e-05 4.482383e-05 4.478329e-05 4.406685e-05 4.378868e-05
## [611] 4.363170e-05 4.338326e-05 4.287022e-05 4.274361e-05 4.231431e-05
## [616] 4.182496e-05 4.154823e-05 4.126938e-05 4.108689e-05 4.090979e-05
## [621] 4.035492e-05 4.000912e-05 3.956083e-05 3.922105e-05 3.900474e-05
## [626] 3.893287e-05 3.876845e-05 3.859532e-05 3.800569e-05 3.737744e-05
## [631] 3.724173e-05 3.701784e-05 3.662021e-05 3.649974e-05 3.566638e-05
## [636] 3.556832e-05 3.519172e-05 3.484690e-05 3.418231e-05 3.402460e-05
## [641] 3.354631e-05 3.303341e-05 3.285406e-05 3.245058e-05 3.198546e-05
## [646] 3.187752e-05 3.164642e-05 3.119449e-05 3.025397e-05 3.015705e-05
## [651] 2.943917e-05 2.875357e-05 2.803551e-05 2.764070e-05 2.722639e-05
## [656] 2.651587e-05 2.411464e-05 2.368025e-05 2.259321e-05 1.632515e-05
## [661] 2.418898e-06 7.977559e-07 1.749557e-29 1.281145e-29 6.932915e-32
## [666] 2.079024e-32 2.168930e-33 1.074094e-33 1.348970e-34
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type='b')

sum(pve[1:2])
## [1] 0.1046179
sum(pve[1:10])
## [1] 0.3002005
# b

df_test$colour <- as.factor(df_test$Digit)

autoplot(pr.mnist, data = df_test , colour = 'colour') + scale_color_manual(labels = c("0", "1", "2", "3","4", "5", "6", "7", "8", "9"), values = c("blue", "red", "yellow", "pink", "orange", "gray", "black", "green", "purple", "white" ))

# c

df_10c <- as.data.frame(pr.mnist$x[,1:2])
df_10c$Digit <- as.factor(df_test$Digit)

df_10c <- subset(df_10c, Digit == 1 | Digit == 2 | Digit == 4)

ggplot(data = df_10c, aes(PC1,PC2, colour = Digit)) + geom_point()

# d

df_10d <- as.data.frame(pr.mnist$x[,1:2])
df_10d$Digit <- as.factor(df_test$Digit)

df_10d <- subset(df_10d, Digit == 5 | Digit == 3 | Digit == 6)

ggplot(data = df_10d, aes(PC1,PC2, colour = Digit)) + geom_point()

(a). The first 2 Principle Components explain 10.5% of the total variation, whereas the first 10 Principle Components explain 30.0% of the variation in the data.